; Render loops that call the rasterising routines.
; Covers Gouraud (grd), Gouraud + texture, point light and flat shading,
; plus the edges-only models.
;-----------------------------------------------------------------------
; draw_thread - render the current model for one thread.
;
; in:   ebx = thread number, 0 = first thread.
;             Bit 31 set selects the edges-only path (.draw_edges).
; out:  nothing returned in registers.
; uses: eax, ebx, ecx, edx, esi, edi, xmm0-xmm7 and flags are not
;       preserved; DF is cleared; ebp is restored on exit.
;
; Flow:
;   1. Build a 16-byte aligned local frame and copy 16 dwords of global
;      pointers/counters (starting at triangles_normals_rotated_ptr)
;      into it.
;   2. Return at once when the triangle count is zero.
;   3. draw_flag = 14 -> .draw_vertices, thread bit 31 -> .draw_edges,
;      otherwise the triangle loop, clipped to this thread's band
;      y_min = (yres/2)*thread_no, y_max = y_min + yres/2.
;-----------------------------------------------------------------------
draw_thread:        ; in: ebx - thread number; 0 - first thread
    push   ebp
    mov    ebp,esp
    ; Reserve room for the frame.  The lowest local sits at most
    ; 15 + 144 + 128 = 287 bytes below the saved ebp, so 396 is ample.
    ; A multiple of 4 keeps esp at the alignment it had on entry
    ; (the previous 394 shifted every later push/pop/call by 2 bytes).
    sub    esp,396
    and    ebp,-16     ; 16-byte align the frame base (movaps is used on locals)
    sub    ebp,128+16  ; bias ebp so locals at ebp-128..ebp+127 use short (8-bit) displacements
 ;   include "labs.inc"

   ; ---- screen window of this thread: four consecutive words ----
   .x_min           equ word [ebp-2]
   .x_max           equ word [ebp-4]
   .y_min           equ      [ebp-6]
   .y_max           equ      [ebp-8]
   .y_max_draw_thrd equ      [ebp-8]   ; alias of .y_max, start of the 4-word window pack
   .zz3             equ      [ebp-12]
   .zz2             equ      [ebp-14]
   .zz1             equ      [ebp-16]
   .pack2           equ      [ebp-32]  ; (y_min,x_min) word pair repeated four times
                                       ; - 36
   .line_call       equ       [ebp-40] ; edge path: line routine called per edge
   .en_draw         equ dword [ebp-44] ; holds the address of .end_draw (target of jmp .en_draw)
   .third_p_index   equ       [ebp-48] ; \
   .second_p_index  equ dword [ebp-52] ; | > vertex index * 12, read as an array by .point_light
   .first_p_index   equ dword [ebp-56] ; /
   .thread_no       equ dword [ebp-60]

   .scr             equ dword [ebp-64] ;  \
   .zbuf            equ       [ebp-68] ;  | > dont change order - line functions!!
   .line_tex_ptr    equ dword [ebp-72] ;  |           ; cover .in1
   .width           equ       [ebp-76] ;  /           ; cover .in2
   .line_horiz      equ       [ebp-80]                ; cover .in3
   .in1             equ       [ebp-72] ; triangle path: vertex index * 4
   .in2             equ       [ebp-76]
   .in3             equ       [ebp-80]
   .xy3             equ        ebp-88  ; packed (y,x) words of the three vertices;
   .xy2             equ        ebp-92  ; these three are addresses, not operands -
   .xy1             equ        ebp-96  ; they are used as [.xy1] etc.

   .zer_hgst        equ       [ebp-112] ; mask: three low dwords all ones, highest dword zero
   .tex_size        equ dword [ebp-116] ; \
   .tex_x4          equ dword [ebp-120] ; | > loaded as one 16-byte pack
   .tex_shift       equ dword [ebp-124] ; |   starting at .hline_call
   .hline_call      equ       [ebp-128] ; /


   .correct_texf    equ       [ebp]
   .mask255f        equ       [ebp+16]  ; 255.0, 255.0, 255.0, 0.0
   .draw_flag       equ       [ebp+32] ; \  not xchg
   .shadow_flag     equ byte  [ebp+33] ; /  order
   .culling_flag    equ byte  [ebp+34]
                                       ; + 36
   .xres            equ       [ebp+38] ; word xres immediately followed by word yres
   .yres            equ word  [ebp+40]
                                       ; + 44
   .pack1           equ       [ebp+48] ; min max compare: (y_max,x_max) word pair repeated

   ; ---- 16 dwords copied from the globals by rep movsd below ----
   .tri_nor_rotated_ptr  equ dword[ebp+64]
   .points_normals_ptr   equ dword[ebp+68]
   .points_count_var     equ dword[ebp+72]
   .triangles_count_var  equ dword[ebp+76]
   .points_r_ptr         equ dword[ebp+80]
   .triangles_ptr        equ dword[ebp+84]
   .por                  equ dword[ebp+88]
   .pnr                  equ dword[ebp+92]
   .edges_ptr            equ dword[ebp+96]
   .edges_count          equ dword[ebp+100]
   .edge_s_d_ptr         equ dword[ebp+104]
   .screen_ptr           equ      [ebp+108]
   .Zbuffer_ptr          equ dword[ebp+112]
   .slices_counter_ptr   equ dword[ebp+116]
   .slices_ptrs_buff_ptr equ dword[ebp+120]
   .tex_points_f_ptr     equ dword[ebp+124]



    ; ---- texture parameters: small set when s_tex_flag = 1 ----
    mov       .hline_call,dword horizontal_tex_grd_line
    cmp       [s_tex_flag],1
    je        .st
    mov       .tex_shift,TEX_SHIFT
    mov       .tex_x4,TEX_X * 4
    mov       .tex_size,TEXTURE_SIZE
    jmp       .txb
  .st:
    mov       .tex_shift,TEX_SHIFT_S
    mov       .tex_x4,TEX_X_S * 4
    mov       .tex_size,TEXTURE_SIZE_S
 .txb:
    ; ---- copy 16 global dwords into the frame (ebp+64 .. ebp+127) ----
    cld
    lea        esi,[triangles_normals_rotated_ptr]
    lea        edi,.tri_nor_rotated_ptr
    mov        ecx,16
    movaps     xmm0,[correct_texf]
    rep        movsd
    xor        ecx,ecx
    cmp        .triangles_count_var,ecx
    jz         .the_end                 ; no triangles - nothing is drawn at all
    mov        .thread_no,ebx
    mov        .en_draw,.end_draw       ; address of .end_draw for the indirect jumps
    movaps     .correct_texf,xmm0
    ; ---- constant masks ----
    pcmpeqd    xmm0,xmm0
    psrldq     xmm0,4                   ; clear the highest dword
    movaps     .zer_hgst,xmm0
    psrld      xmm0,24                  ; 0xff in the three low dwords
    cvtdq2ps   xmm0,xmm0
    movaps     .mask255f,xmm0
    ; init lines 4 pack - edges only models
    mov        dh,[shadow_flag]
    mov        dl,[draw_flag]
    mov        al,[culling_flag]
    movlps     xmm0,.screen_ptr         ; screen_ptr and Zbuffer_ptr together
    shufps     xmm0,xmm0,11110001b      ; swap them: Zbuffer first, screen second
    movlps     .zbuf,xmm0               ; fills .zbuf and .scr
    ; line call pack of pointers/values
    movlps     xmm1,[xres_vard]
    movss      .xres,xmm1               ; one dword store fills .xres and .yres
    mov        .culling_flag,al
;   mov        .shadow_flag,dh
    mov        .draw_flag,dx            ; word store: dl -> draw_flag, dh -> shadow_flag
    cld
    xorps      xmm4,xmm4
    mov        eax,.thread_no
    pshuflw    xmm1,xmm1,00000001b      ; word0 = yres, word1 = xres
    punpcklwd  xmm1,xmm4
    movlps     .y_max,xmm1              ; y_max = yres, y_min = 0, x_max = xres, x_min = 0
    punpcklwd  xmm1,xmm4
    shufps     xmm1,xmm1,10110001b  ; xm1 lo - hi =  ymin ymax xmin xmax
    cmp        dl,14
    je         .draw_vertices
    btr        eax,31                   ; bit 31 of the thread number requests edges
    jc        .draw_edges
  @@:
    ; ---- vertical band owned by this thread ----
    movzx      ebx,.yres
    shr        ebx,1   ; shr ebx,n ;n=1 - 2 threads, n=2 - 4 threads...
    mov        ecx,ebx
    imul       ebx,eax
    mov        .y_min,bx
    add        ebx,ecx
    mov        .y_max,bx
  .skip_div:
    ; ---- build the broadcast compare packs from the window words ----
    pcmpeqd    xmm3,xmm3
    psrld      xmm3,16                  ; keep the low word of every dword
    movlps     xmm1,.y_max
    movlps     xmm2,.y_min
    andps      xmm1,xmm3
    andps      xmm2,xmm3
    packssdw   xmm1,xmm1
    packssdw   xmm2,xmm2
    shufps     xmm1,xmm1,0
    shufps     xmm2,xmm2,0
    movaps     .pack1,xmm1 ; compare packs to make block
    movaps     .pack2,xmm2 ; cmp of all tris coords easier
    cmp        .shadow_flag,1
    jne        @f
    xorps      xmm3,xmm3
    movups     xmm1,.y_max_draw_thrd
    punpcklwd  xmm1,xmm3
    xorps      xmm2,xmm2
    shufps     xmm1,xmm1,10110001b  ; xm1 lo - hi =  ymin ymax xmin xmax
    call       do_shadow
  @@:
    mov        esi,.triangles_ptr            ; draw triangles
    xor        ecx,ecx                       ; ecx = triangle number
  .draw_triangle:
    push       ecx
    cld
    lodsd
    xchg       eax,edx                       ; edx = first vertex index
    lodsd
    xchg       eax,ebx
    lodsd
    xchg       eax,ebx                       ; eax = second, ebx = third vertex index
    push       esi
;   pushad
    cmp        .culling_flag,1
    je         @f
    ; test the sign bit of the third dword of this triangle's rotated normal
    imul       ecx,12
    add        ecx,.tri_nor_rotated_ptr
    bt         dword[ecx+8],31
    jnc        .end_draw
  @@:
    shl        edx,2
    shl        eax,2
    shl        ebx,2
    mov        .in1,edx                      ; index * 4
    mov        .in2,eax
    mov        .in3,ebx
    lea        edx,[edx*3]
    lea        eax,[eax*3]
    lea        ebx,[ebx*3]
    mov        .first_p_index,edx            ; index * 12 = byte offset of a 12-byte record
    mov        .second_p_index,eax
    mov        .third_p_index,ebx
    mov        ecx,.por
    movups     xmm0,[edx+ecx]
    movups     xmm1,[eax+ecx]
    movups     xmm2,[ebx+ecx]
    shufps     xmm0,xmm0,11100001b           ; swap the two low floats: y first, then x
    shufps     xmm1,xmm1,11100001b
    shufps     xmm2,xmm2,11100001b
    ; gather the z values: xmm4 = z1, z2, z3, 0
    movaps     xmm4,xmm0
    movhlps    xmm4,xmm1
    shufps     xmm4,xmm4,10000010b
    movhlps    xmm5,xmm2
    movlhps    xmm4,xmm5
    andps      xmm4,.zer_hgst
    ; pack the coordinates to signed words and store them at .xy1 .. .xy3
    cvtps2dq   xmm0,xmm0
    cvtps2dq   xmm1,xmm1
    cvtps2dq   xmm2,xmm2
    packssdw   xmm0,xmm0
    packssdw   xmm1,xmm1
    packssdw   xmm2,xmm2
    punpckldq  xmm0,xmm1
    or         ebx,-1
    movlhps    xmm0,xmm2
    movups     [.xy1],xmm0
                             ; check if at least a fragment
    movaps     xmm3,xmm0     ; is visible
    pcmpgtw    xmm0,.pack1   ; max
    pcmpgtw    xmm3,.pack2   ; min
    shr        ebx,20        ; ebx = 0xfff: only the 12 bytes of the three (y,x) pairs count
    pmovmskb   ecx,xmm0
    pmovmskb   eax,xmm3
    and        ecx,ebx
    and        eax,ebx
    cmp        ecx,ebx
    jz         .end_draw     ; every coordinate is above the maximum
    or         eax,eax
    jz         .end_draw     ; no coordinate is above the minimum
    movzx      eax,byte .draw_flag
    add        eax,render_opts
    movzx      edi,byte[eax]                 ; edi = render_opts[draw_flag]
    mov        ecx,.pnr
    mov        eax,.first_p_index
    mov        ebx,.second_p_index
    mov        edx,.third_p_index
    movaps     xmm5,.correct_texf

    ; ---- colour-map addresses from the first two floats of each .pnr record ----
    movlps     xmm7,[eax+ecx]
    movhps     xmm7,[ebx+ecx]
    movlps     xmm2,[edx+ecx]
    mulps      xmm7,xmm5
    mulps      xmm2,xmm5
    addps      xmm7,xmm5
    addps      xmm2,xmm5
    mov        edx,color_map
    shufps     xmm7,xmm7,10001101b
    push       edx edx edx edx               ; color_map four times, loaded into xmm5 below
    cvtps2dq   xmm7,xmm7
    cvtps2dq   xmm2,xmm2
    movups     xmm5,[esp]
    pslld      xmm7,2                        ; * 4
    pslld      xmm2,2
    movlhps    xmm3,xmm2  ; xm3 = x1, x2, x3
    shufps     xmm2,xmm2,11000101b
    movhlps    xmm3,xmm7
    movlhps    xmm7,xmm2
    pslld      xmm7,TEX_SHIFT

    paddd      xmm7,xmm3                     ; (y << TEX_SHIFT) + x, both already * 4
    paddd      xmm7,xmm5                     ; + color_map

    movups     [esp],xmm7
    pop        eax  ebx edx ecx              ; three addresses; the fourth (ecx) is not used
    movlps     xmm0,[eax]
    movlps     xmm1,[ebx]
    movlps     xmm2,[edx]
;    sub        esp,8
;    movlps     [esp],xmm2
;    pop        eax ebx
;    shl        ebx,TEX_SHIFT
;    add        eax,ebx
;    movlps     xmm2,[eax+edx]
    xorps      xmm3,xmm3
    punpcklbw  xmm0,xmm3  ; colors - words in xmm0
    punpcklbw  xmm1,xmm3  ; colors - words in xmm1
    punpcklbw  xmm2,xmm3  ; and xmm2
    movlhps    xmm0,xmm1  ; xmm0 = first colour (low half), second colour (high half)
    cmp        byte .draw_flag,0
    je         .point_light
    bt         edi,3                         ; bit 3 of the render_opts entry selects Gouraud + tex
    jc         @f
    ; ---- flat: average the three vertex colours ----
    mov        eax,0x55555555  ; flat case: high word of (sum * 0x5555) is about sum / 3
    movhlps    xmm3,xmm0
    paddw      xmm2,xmm0
    paddw      xmm2,xmm3
    movd       xmm3,eax
    shufps     xmm3,xmm3,0
    pmulhuw    xmm3,xmm2
    packuswb   xmm3,xmm3        ; flat in xmm3
    jmp        .flat_draw_float
  @@:
  .grd_draw_rp:                ;****Gouraud shading + tex ****
    movlps     xmm5,.y_max_draw_thrd
    xorps      xmm7,xmm7
    punpcklwd  xmm5,xmm7
    shufps     xmm5,xmm5,10110001b           ; xmm5 = ymin, ymax, xmin, xmax

    punpcklwd  xmm0,xmm7                     ; vertex colours: words -> dwords -> floats
    punpcklwd  xmm1,xmm7
    punpcklwd  xmm2,xmm7
    cvtdq2ps   xmm0,xmm0
    cvtdq2ps   xmm1,xmm1
    cvtdq2ps   xmm2,xmm2
    mov        eax,.in1
    mov        edx,.in2
    mov        ecx,.in3
    mov        ebx,.tex_points_f_ptr
    movlps     xmm6,[2*eax+ebx]              ; 8 bytes per vertex from .tex_points_f_ptr
    movhps     xmm6,[2*edx+ebx]
    movlps     xmm3,[2*ecx+ebx]
    mov        eax,[.xy1]
    mov        ebx,[.xy2]
    mov        ecx,[.xy3]
    movups     xmm7,.hline_call   ;esi   ;[hgrdt]   ; gouraud tex: hline_call, tex_shift, tex_x4, tex_size
    mov        esi,.zbuf
    mov        edi,.scr
    mov        edx,texmap
    call       glass_tex_tri
    jmp        .en_draw                      ; indirect jump to .end_draw
 .flat_draw_float:
    movaps     xmm0,xmm4                     ; z pack
    mov        eax,[.xy1]
    mov        ebx,[.xy2]
    mov        ecx,[.xy3]
    movaps     xmm7,xmm3                     ; flat colour
    mov        esi,flat_line
    movd       xmm5,esi
    mov        esi,.zbuf
    mov        edi,.scr
    mov        edx,.xres
    movups     xmm1,.y_max_draw_thrd
    call       stencil_tri
    jmp        .en_draw                      ; indirect jump to .end_draw
 .point_light:
    sub        esp,16
    shufps     xmm4,xmm4,11000110b           ; z3, z2, z1, 0
    movups     [esp],xmm4    ; z pack
    movups     xmm5,[point_light_coords]
    xor        eax,eax                       ; eax = 0,2,4 -> first, second, third vertex
    lea        esi,.first_p_index
  .next_vertex:
    movaps     xmm1,xmm5
    mov        ebx,[esi+2*eax]               ; vertex index * 12
    mov        edx,ebx
    add        ebx,.por
    movups     xmm2,[ebx]
    andps      xmm2,.zer_hgst
    subps      xmm1,xmm2                     ; light position - vertex position
    andps      xmm1,.zer_hgst
    movaps     xmm7,xmm1
    mulps      xmm7,xmm7
    haddps     xmm7,xmm7
    haddps     xmm7,xmm7                     ; squared length in every lane
    rsqrtps    xmm7,xmm7
    mulps      xmm1,xmm7                     ; approximately normalised
    add        edx,.pnr
    movups     xmm3,[edx]
    mulps      xmm3,xmm1
    haddps     xmm3,xmm3     ; sse3 required
    haddps     xmm3,xmm3     ; xmm3 - dot product
    xorps      xmm4,xmm4
    maxps      xmm3,xmm4                     ; clamp negative values to zero
    shufps     xmm3,xmm3,11000000b
    movaps     xmm4,xmm3
    mov        ebx,p_light_o
    mov        ecx,5
  @@:
    mulps      xmm4,xmm4                     ; five squarings: xmm4 = xmm3 ^ 32
    loop       @b
    movlps     xmm0,[ebx+16] ; light_c_max
    xorps      xmm1,xmm1
    punpcklwd  xmm0,xmm1
    cvtdq2ps   xmm2,xmm0
    mulps      xmm4,xmm2     ; xmm4 reflection(emissive?) component of light

    movlps     xmm0,[ebx]    ; p_light_o
    punpcklwd  xmm0,xmm1
    cvtdq2ps   xmm2,xmm0
    addps      xmm4,xmm2
    movlps     xmm0,[ebx+8]  ; p_light_min
    punpcklwd  xmm0,xmm1
    cvtdq2ps   xmm2,xmm0
    mulps      xmm2,xmm3
    addps      xmm4,xmm2
    minps      xmm4,.mask255f                ; clamp to 255.0
    sub        esp,16
    shufps     xmm4,xmm4,11010010b
    movups     [esp],xmm4                    ; keep this vertex colour on the stack
    add        eax,2
    cmp        eax,6
    jnz        .next_vertex
    movups     xmm0,[esp]                    ; third vertex colour
    movups     xmm1,[esp+16]                 ; second
    movups     xmm2,[esp+32]                 ; first
    add        esp,48
    movups     xmm4,[esp]                    ; z pack saved on entry to .point_light
    add        esp,16
    movlps     xmm5,.y_max_draw_thrd
    xorps      xmm7,xmm7
    punpcklwd  xmm5,xmm7
    shufps     xmm5,xmm5,10110001b           ; xmm5 = ymin, ymax, xmin, xmax
    mov        eax,[.xy3]                    ; vertices passed in reverse order,
    mov        ebx,[.xy2]                    ; matching the colours and z pack above
    mov        ecx,[.xy1]

    mov        esi,horizontal_tex_grd_line
    mov        edi,.scr
    movd       xmm7,esi ; horizontal line address
    mov        esi,.zbuf
    call       glass_tex_tri
 .end_draw:
    pop        esi                           ; triangle list pointer
    pop        ecx                           ; triangle number
    inc        ecx
    cmp        ecx,.triangles_count_var
    jnz        .draw_triangle
    jmp        .the_end
; edges      ******************************************************
 .draw_edges:
    cmp        .shadow_flag,1
    jne        @f
    call       do_shadow
   @@:
    ; ---- pick the horizontal routine and the texture source by draw_flag ----
    movzx      ecx,byte .draw_flag
    mov        esi,horizontal_grd
    mov        edi,horizontal_tx
    mov        edx,plain_horizontal
    cmp        cl,13
    cmove      edi,edx   ; plain
    cmp        cl,9
    cmove      edi,esi   ; grd
    mov        .line_call,dword line_grd_tex
    mov        .line_horiz,edi
    mov        edx,envmap
    mov        eax,texmap
    mov        ebx,color_map
    mov        esi,0x0000ff00
    cmp        cl,13
    cmove      edx,esi
    cmp        cl,12
    cmove      edx,eax
    cmp        cl,9         ; .line_tex_ptr
    cmove      edx,ebx
    mov        .line_tex_ptr ,edx
    movzx      edx,word .xres
    mov        .width,edx
    mov        ecx,.edges_count
    test       ecx,ecx
    jz         .the_end                      ; no edges: the dec/jnz loop below would wrap
    mov        esi,.edges_ptr

  .dr_edge:
    push       ecx
    cld
    lodsd
    xchg       eax,ebx
    lodsd
    xchg       eax,ebx                       ; eax = first index read, ebx = second
    push       esi
    cmp        .culling_flag,1
    je         @f
    ; test the sign bit of the third dword of the .pnr record of the first index
    mov        ecx,eax
    imul       ecx,12
    add        ecx,.pnr
    bt         dword[ecx+8],31
    jnc        .end_dr_ed
  @@:
    push       eax ebx
    imul       ebx,12
    imul       eax,12
    mov        edx,.por
    movlps     xmm4,[ebx+edx]
    movhps     xmm4,[eax+edx]
    cvtps2dq   xmm4,xmm4                     ; integer x,y: ebx point low, eax point high
    movlps     xmm2,[ebx+edx+8]
    movlps     xmm3,[eax+edx+8]
    mov        edx,.pnr
    movlps     xmm0,[eax+edx]
    movhps     xmm0,[ebx+edx]
    movaps     xmm5,.correct_texf
    mulps      xmm0,xmm5
    addps      xmm0,xmm5
    ; xm0  -  tex env/col coords as float
    cvtps2dq   xmm6,xmm0
    pop        ebx eax                       ; restore both indices
   @@:
    cmp        .draw_flag,byte 12
    jne        @f
    ; draw_flag = 12: take 8-byte entries from .tex_points_f_ptr
    shl        eax,3
    shl        ebx,3
    add        eax,.tex_points_f_ptr
    add        ebx,.tex_points_f_ptr
    movups     xmm0,[eax]
    movups     xmm1,[ebx]
    jmp        .f22
   @@:
    cmp        byte .draw_flag,13
    je         .f22
    ; otherwise fetch a texel per endpoint from .line_tex_ptr and convert to floats
    sub        esp,16
    movups     [esp],xmm6
    pop        eax ebx ecx edx
    ; NOTE(review): the index here is (first coord << TEX_SHIFT) + second,
    ; while the triangle path uses (second << TEX_SHIFT) + first - confirm intended.
    shl        eax,TEX_SHIFT
    shl        ecx,TEX_SHIFT
    add        eax,ebx
    add        ecx,edx
    mov        ebx,.line_tex_ptr  ; color_map
    movlps     xmm0,[4*eax+ebx]
    movlps     xmm1,[4*ecx+ebx]
  ;  xorps      xmm6,xmm6
  ;  punpcklbw  xmm0,xmm6
  ;  punpcklbw  xmm1,xmm6
  ;  punpcklwd  xmm0,xmm6
  ;  punpcklwd  xmm1,xmm6
    pmovzxbd   xmm0,xmm0
    pmovzxbd   xmm1,xmm1
    cvtdq2ps   xmm0,xmm0
    cvtdq2ps   xmm1,xmm1
  .f22:
    sub        esp,16
    movups     [esp],xmm4
    pop        eax ebx ecx edx               ; eax,ebx = x,y of one end; ecx,edx = x,y of the other
    xorps      xmm6,xmm6
    movss      xmm7,dword .xres
    punpcklwd  xmm7,xmm6
    movlhps    xmm7,xmm7                     ; xmm7 = xres, yres, xres, yres
    movups     xmm5,.width                   ; pack: width, line_tex_ptr, zbuf, scr
    mov        edi,.line_horiz
    call       dword .line_call
   .end_dr_ed:
    pop        esi                           ; edge list pointer
    pop        ecx                           ; edges left
    dec        ecx
    jnz        .dr_edge
 ;   loop    .dr_edge
    jmp        .the_end
  .draw_vertices:
    cmp        .shadow_flag,1
    jne        @f
    call       do_shadow
   @@:
    mov        esi,.por
    mov        ecx,.points_count_var
    test       ecx,ecx
    jz         .the_end                      ; no points: the loop below would wrap
    movzx      edx,word .xres
    or         eax,-1                        ; value written for every visible vertex
    xorps      xmm3,xmm3
    movd       xmm5,.xres
    punpcklwd  xmm5,xmm3                     ; xmm5 = xres, yres, 0, 0
    cld
  .again_vert:
    movups     xmm0,[esi]
    cvtps2dq   xmm0,xmm0
    xorps      xmm3,xmm3
    movaps     xmm4,xmm5
    pcmpgtd    xmm4,xmm0                     ; limit > coordinate
    pcmpgtd    xmm3,xmm0                     ; 0 > coordinate
    xorps      xmm3,xmm4
    pmovmskb   ebx,xmm3
    cmp        bl,0xff                       ; both x and y must satisfy 0 <= v < limit
    jne        @f
    sub        esp,8
    movlps     [esp],xmm0
    pop        ebx edi                       ; ebx = x, edi = y
    imul       edi,edx
    add        edi,ebx
    shl        edi,2
    add        edi,.scr
    stosd
   @@:                         ;  skip vertex
    add        esi,12
    loop       .again_vert
 .the_end:
    add        esp,396                       ; must match the reservation in the prologue
    pop        ebp
ret
